Author: Andrea Ierardi

 

Repository: Code link

 

Assignment

The exam consists of two assignments: one on the first part (regression, trees, neural nets) and one on the second part (unsupervised learning). For both you must prepare a written report applying one or more techniques and comparing their performance on one or more data sets chosen by the student.

 

Libraries

library(knitr)
library(ggplot2)
library(plotly)
## 
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
## 
##     last_plot
## The following object is masked from 'package:stats':
## 
##     filter
## The following object is masked from 'package:graphics':
## 
##     layout
library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(png)
library(ggpubr)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------- tidyverse 1.3.0 --
## v tibble  3.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## v purrr   0.3.4
## -- Conflicts ----------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks plotly::filter(), stats::filter()
## x dplyr::lag()    masks stats::lag()
library(caTools)
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(tree)
## Registered S3 method overwritten by 'tree':
##   method     from
##   print.tree cli
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:plotly':
## 
##     select
library(randomForest)
## randomForest 4.6-14
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
## 
##     combine
## The following object is masked from 'package:ggplot2':
## 
##     margin
library(ranger)
## 
## Attaching package: 'ranger'
## The following object is masked from 'package:randomForest':
## 
##     importance
library(tuneRanger)
## Loading required package: mlrMBO
## Loading required package: mlr
## Loading required package: ParamHelpers
## 'mlr' is in maintenance mode since July 2019. Future development
## efforts will go into its successor 'mlr3' (<https://mlr3.mlr-org.com>).
## 
## Attaching package: 'mlr'
## The following object is masked from 'package:caret':
## 
##     train
## Loading required package: smoof
## Loading required package: checkmate
## Loading required package: parallel
## Loading required package: lubridate
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
## Loading required package: lhs
library(keras)

library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(clustMixType)
library(cluster)
library(dendextend) 
## 
## ---------------------
## Welcome to dendextend version 1.13.4
## Type citation('dendextend') for how to cite the package.
## 
## Type browseVignettes(package = 'dendextend') for the package vignette.
## The github page is: https://github.com/talgalili/dendextend/
## 
## Suggestions and bug-reports can be submitted at: https://github.com/talgalili/dendextend/issues
## Or contact: <tal.galili@gmail.com>
## 
##  To suppress this message use:  suppressPackageStartupMessages(library(dendextend))
## ---------------------
## 
## Attaching package: 'dendextend'
## The following object is masked from 'package:ggpubr':
## 
##     rotate
## The following object is masked from 'package:stats':
## 
##     cutree
library(readr)

library(factoextra)
## Welcome! Want to learn more? See two factoextra-related books at https://goo.gl/ve3WBa
library(FactoMineR)


library(PCAmixdata)

 

Dataset

Link of the dataset

# Load the Airbnb NYC 2019 listings from the CSV referenced above.
ds = read.csv("AB_NYC_2019.csv")

 

Data Inspection

# Quick look at the raw data: first rows and per-column summary statistics.
head(ds)
summary(ds)
##        id               name              host_id           host_name        
##  Min.   :    2539   Length:48895       Min.   :     2438   Length:48895      
##  1st Qu.: 9471945   Class :character   1st Qu.:  7822033   Class :character  
##  Median :19677284   Mode  :character   Median : 30793816   Mode  :character  
##  Mean   :19017143                      Mean   : 67620011                     
##  3rd Qu.:29152178                      3rd Qu.:107434423                     
##  Max.   :36487245                      Max.   :274321313                     
##                                                                              
##  neighbourhood_group neighbourhood         latitude       longitude     
##  Length:48895        Length:48895       Min.   :40.50   Min.   :-74.24  
##  Class :character    Class :character   1st Qu.:40.69   1st Qu.:-73.98  
##  Mode  :character    Mode  :character   Median :40.72   Median :-73.96  
##                                         Mean   :40.73   Mean   :-73.95  
##                                         3rd Qu.:40.76   3rd Qu.:-73.94  
##                                         Max.   :40.91   Max.   :-73.71  
##                                                                         
##   room_type             price         minimum_nights    number_of_reviews
##  Length:48895       Min.   :    0.0   Min.   :   1.00   Min.   :  0.00   
##  Class :character   1st Qu.:   69.0   1st Qu.:   1.00   1st Qu.:  1.00   
##  Mode  :character   Median :  106.0   Median :   3.00   Median :  5.00   
##                     Mean   :  152.7   Mean   :   7.03   Mean   : 23.27   
##                     3rd Qu.:  175.0   3rd Qu.:   5.00   3rd Qu.: 24.00   
##                     Max.   :10000.0   Max.   :1250.00   Max.   :629.00   
##                                                                          
##  last_review        reviews_per_month calculated_host_listings_count
##  Length:48895       Min.   : 0.010    Min.   :  1.000               
##  Class :character   1st Qu.: 0.190    1st Qu.:  1.000               
##  Mode  :character   Median : 0.720    Median :  1.000               
##                     Mean   : 1.373    Mean   :  7.144               
##                     3rd Qu.: 2.020    3rd Qu.:  2.000               
##                     Max.   :58.500    Max.   :327.000               
##                     NA's   :10052                                   
##  availability_365
##  Min.   :  0.0   
##  1st Qu.:  0.0   
##  Median : 45.0   
##  Mean   :112.8   
##  3rd Qu.:227.0   
##  Max.   :365.0   
## 
# Print dataset dimensions (rows, columns).
cat("Shape of the dataset:" ,dim(ds))
## Shape of the dataset: 48895 16
# Background map image of NYC used as the canvas for the scatter plots.
img <- readPNG("map.png")


# Plot every listing at its coordinates over the map, coloured by price.
map_ds = ggplot() + background_image(img)+ geom_point(data = ds,  aes(y=latitude,x = longitude, color = price)) 
map_ds

  

Data cleaning

 

Check for NA and NULL values

# Count NA values per column; per the output below, only reviews_per_month
# has missing entries (10052).
apply(ds,2,function(x) sum(is.na(x)))
##                             id                           name 
##                              0                              0 
##                        host_id                      host_name 
##                              0                              0 
##            neighbourhood_group                  neighbourhood 
##                              0                              0 
##                       latitude                      longitude 
##                              0                              0 
##                      room_type                          price 
##                              0                              0 
##                 minimum_nights              number_of_reviews 
##                              0                              0 
##                    last_review              reviews_per_month 
##                              0                          10052 
## calculated_host_listings_count               availability_365 
##                              0                              0

 

Variable selection

# Keep only the variables used by the models.  The explicit dplyr:: prefix
# avoids the select() masking by MASS noted in the library-load messages.
dataset = ds  %>% dplyr::select(neighbourhood_group,latitude, longitude, room_type,price)

head(dataset)

 

Variable scaling

#' Filter and scale the listings data.
#'
#' Keeps listings priced in [15, 500], encodes the categorical columns as
#' integer-labelled factors (1..k in order of first appearance), and
#' standardises the numeric columns.
#'
#' @param df data frame with columns neighbourhood_group, latitude,
#'   longitude, room_type, price.
#' @return list with two elements:
#'   df2 - price AND coordinates standardised (used for clustering),
#'   df  - only price standardised (used for regression/trees).
scale_data = function(df)
{
  # Drop free listings and luxury outliers before scaling.
  df = df %>% filter(price >= 15 & price <= 500)

  numerical = c("price")
  numerical2 = c("latitude", "longitude")
  categorical = c("room_type", "neighbourhood_group")

  # Integer-code the categoricals; `levels` spelled out (the original
  # `level =` only worked through partial argument matching).
  for (cat in categorical)
  {
    lev = unique(df[[cat]])
    df[[cat]] = factor(df[[cat]], levels = lev, labels = seq_along(lev))
  }

  # Standardise price (zero mean, unit variance).
  df[numerical] = as.numeric(scale(df[numerical]))

  # Second copy with the coordinates standardised as well.  Assign the
  # scaled matrix directly instead of flattening it with as.numeric(),
  # which relied on column-wise recycling; the resulting values are
  # identical.
  df2 = df
  df2[numerical2] = scale(df2[numerical2])

  list(df2 = df2, df = df)
}
# Apply the filtering/scaling.  Two parallel versions are kept:
#   dataset - price scaled, raw coordinates (regression / trees)
#   data    - price AND coordinates scaled (clustering)
dataframe = scale_data(dataset)

dataset = dataframe$df

data = dataframe$df2

head(dataset)
summary(dataset)
##  neighbourhood_group    latitude       longitude      room_type
##  1:19858             Min.   :40.50   Min.   :-74.24   1:22167  
##  2:20877             1st Qu.:40.69   1st Qu.:-73.98   2:24503  
##  3: 5632             Median :40.72   Median :-73.96   3: 1145  
##  4:  366             Mean   :40.73   Mean   :-73.95            
##  5: 1082             3rd Qu.:40.76   3rd Qu.:-73.94            
##                      Max.   :40.91   Max.   :-73.71            
##      price        
##  Min.   :-1.3248  
##  1st Qu.:-0.7228  
##  Median :-0.3479  
##  Mean   : 0.0000  
##  3rd Qu.: 0.4587  
##  Max.   : 4.1847

 

Data visualisation after the scaling

# Re-plot the listings over the map after filtering; colour = standardised price.
mappa = ggplot() + background_image(img)+ geom_point(data = dataset,  aes(y=latitude,x = longitude, color = price)) 
mappa

 

Data split

Split data in subsets for each neighbourhood_group and room_type

# Build per-neighbourhood subsets.  Two parallel collections:
#   lis_n      - model data (raw coordinates), one entry per neighbourhood
#   clust_data - clustering data (scaled coordinates)
# tmp[-1] drops the first column (neighbourhood_group), constant within a subset.
neighbourhoods = unique(dataset$neighbourhood_group)

rooms = unique(dataset$room_type)

clust_data = vector("list")

lis_n = vector("list")

for (n in neighbourhoods)
{
  tmp = dataset %>%filter( neighbourhood_group == n) 
  lis_n[[n]] = tmp[-1]
  
  tmp2 = data %>%filter( neighbourhood_group == n) 
  clust_data[[n]] = tmp2[-1]
  # NOTE(review): restoring string labels via unique() assumes the
  # first-appearance order of room types *within this subset* matches the
  # global order in ds — confirm, otherwise labels may be permuted.
  clust_data[[n]]$room_type = factor(clust_data[[n]]$room_type , level = unique(clust_data[[n]]$room_type) , labels= unique(ds$room_type))
}



# Finer split: one subset per (neighbourhood, room type) pair; tmp[-1][-3]
# drops both now-constant columns (neighbourhood_group, then room_type).
lis_r_n= vector("list")
for (n in neighbourhoods)
{
  for(r in rooms)
  {
    tmp = dataset %>%
      filter( room_type == r  & neighbourhood_group == n) 
    lis_r_n[[paste0("n",n,"-","r",r)]]= tmp[-1][-3]
    
    
    tmp2 = data %>%
      filter( room_type == r  & neighbourhood_group == n) 
    clust_data[[paste0("n",n,"-","r",r)]]= tmp2[-1][-3]
    
  }
}


# Map the integer factor codes of the full scaled data back to readable
# labels (same first-appearance-order assumption as above).
data$neighbourhood_group = factor(data$neighbourhood_group  , level = unique(data$neighbourhood_group ) , labels= unique(ds$neighbourhood_group)) 
data$room_type = factor(data$room_type  , level = unique(data$room_type ) , labels= unique(ds$room_type)) 

clust_data[["all"]] = data

 

Split in train and test for each subset

# 75/25 train/test split for every subset and for the full dataset.
# NOTE(review): caTools::sample.split() expects a *vector* of outcome values
# (e.g. lis_n[[i]]$price); here it receives a whole data frame, so the
# logical mask it returns is recycled over the rows by subset() — verify this
# yields the intended split ratio.  Also: no set.seed(), so the splits are
# not reproducible between runs.
trains = vector("list")
tests = vector("list")
datas = vector("list")

for (i in names(lis_n))
{
  sample = sample.split(lis_n[[i]], SplitRatio = .75)
  train = subset(lis_n[[i]], sample == TRUE)
  test  = subset(lis_n[[i]], sample == FALSE)
  trains[[i]]=  train
  tests[[i]] = test
  datas[[i]] = lis_n[[i]]
}

# Same split for each (neighbourhood, room type) subset.
for (i in names(lis_r_n))
{
  sample = sample.split(lis_r_n[[i]], SplitRatio = .75)
  train = subset(lis_r_n[[i]], sample == TRUE)
  test  = subset(lis_r_n[[i]], sample == FALSE)
  trains[[i]]=  train
  tests[[i]] = test
  datas[[i]] = lis_r_n[[i]]
  
}

# Split of the complete dataset, stored under the key "all".
sample = sample.split(dataset, SplitRatio = .75)
train = subset(dataset, sample == TRUE)
test  = subset(dataset, sample == FALSE)

trains[["all"]] = train
tests[["all"]] = test
datas[["all"]] = dataset

 

MODELS TRAIN

 

# Container collecting the fitted models of every type, keyed by model name.
model_lis = vector("list")

 

LINEAR REGRESSION

# Fit an OLS model price ~ . on every subset; store fit, summary, test-set
# predictions and test MSE.
# NOTE(review): the name `lm.fit` shadows the stats::lm.fit() function; this
# works because R's call lookup skips non-function bindings, but renaming
# the variable (e.g. `fit`) would be safer.
lin_reg = vector("list")

for (sub in names(trains))
{
  lin_reg[[sub]]$fit =lm.fit = lm(price~., data = trains[[sub]])
  lin_reg[[sub]]$summary = summary(lm.fit)
  
  # Predictions on the held-out rows of the same subset.
  lin_reg[[sub]]$pred  = pr.lm = predict(lm.fit,tests[[sub]])
  
  # Mean squared error on the standardised price scale.
  lin_reg[[sub]]$MSE = sum((pr.lm - tests[[sub]]$price)^2)/nrow(tests[[sub]])
  print(paste0("========== ",sub, " =========="))
  print(summary(lm.fit))
  cat("\n\n")
}
## [1] "========== 1 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8345 -0.3464 -0.1178  0.1731  4.9573 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -548.77851   20.08969 -27.316  < 2e-16 ***
## latitude       5.37557    0.20584  26.115  < 2e-16 ***
## longitude     -4.45432    0.22293 -19.980  < 2e-16 ***
## room_type2     0.97147    0.01122  86.566  < 2e-16 ***
## room_type3    -0.17381    0.03864  -4.499 6.89e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6676 on 14888 degrees of freedom
## Multiple R-squared:  0.3904, Adjusted R-squared:  0.3902 
## F-statistic:  2383 on 4 and 14888 DF,  p-value: < 2.2e-16
## 
## 
## 
## [1] "========== 2 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4027 -0.5349 -0.1876  0.2847  4.7542 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -841.73748   58.43516 -14.405  < 2e-16 ***
## latitude      -0.51541    0.35505  -1.452    0.147    
## longitude    -11.65953    0.61678 -18.904  < 2e-16 ***
## room_type2     1.01447    0.01503  67.505  < 2e-16 ***
## room_type3    -0.28394    0.04780  -5.941  2.9e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8812 on 15653 degrees of freedom
## Multiple R-squared:  0.3352, Adjusted R-squared:  0.3351 
## F-statistic:  1973 on 4 and 15653 DF,  p-value: < 2.2e-16
## 
## 
## 
## [1] "========== 3 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3742 -0.2988 -0.1270  0.1391  5.0902 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -7.94589   12.53702  -0.634  0.52625    
## latitude    -0.67106    0.28710  -2.337  0.01947 *  
## longitude   -0.46762    0.20025  -2.335  0.01958 *  
## room_type2   0.80929    0.01951  41.478  < 2e-16 ***
## room_type3  -0.17440    0.05165  -3.377  0.00074 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6028 on 4219 degrees of freedom
## Multiple R-squared:  0.3044, Adjusted R-squared:  0.3037 
## F-statistic: 461.5 on 4 and 4219 DF,  p-value: < 2.2e-16
## 
## 
## 
## [1] "========== 4 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9102 -0.3342 -0.1469  0.2236  3.4561 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -41.03733  131.03461  -0.313    0.754    
## latitude     -0.46768    1.38088  -0.339    0.735    
## longitude    -0.79964    1.23282  -0.649    0.517    
## room_type2    0.69732    0.07446   9.365   <2e-16 ***
## room_type3   -0.13585    0.23435  -0.580    0.563    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6009 on 270 degrees of freedom
## Multiple R-squared:  0.2664, Adjusted R-squared:  0.2555 
## F-statistic: 24.51 on 4 and 270 DF,  p-value: < 2.2e-16
## 
## 
## 
## [1] "========== 5 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9669 -0.2906 -0.1359  0.1124  4.9772 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 62.09595   75.12511   0.827    0.409    
## latitude    -0.46076    0.89539  -0.515    0.607    
## longitude    0.59638    0.72879   0.818    0.413    
## room_type2   0.67380    0.04732  14.238   <2e-16 ***
## room_type3  -0.15156    0.09529  -1.591    0.112    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6266 on 806 degrees of freedom
## Multiple R-squared:  0.2199, Adjusted R-squared:  0.216 
## F-statistic: 56.79 on 4 and 806 DF,  p-value: < 2.2e-16
## 
## 
## 
## [1] "========== n1-r1 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7167 -0.2399 -0.0919  0.1142  5.1429 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -347.2980    19.5998  -17.72   <2e-16 ***
## latitude       2.9184     0.2014   14.49   <2e-16 ***
## longitude     -3.0816     0.2151  -14.32   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4287 on 6721 degrees of freedom
## Multiple R-squared:  0.04726,    Adjusted R-squared:  0.04698 
## F-statistic: 166.7 on 2 and 6721 DF,  p-value: < 2.2e-16
## 
## 
## 
## [1] "========== n1-r2 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.9631 -0.5724 -0.2106  0.2963  4.4458 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -764.9500    39.6271  -19.30   <2e-16 ***
## latitude       8.2186     0.4068   20.20   <2e-16 ***
## longitude     -5.8264     0.4454  -13.08   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8599 on 6238 degrees of freedom
## Multiple R-squared:  0.07359,    Adjusted R-squared:  0.07329 
## F-statistic: 247.7 on 2 and 6238 DF,  p-value: < 2.2e-16
## 
## 
## 
## [1] "========== n1-r3 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.43709 -0.22328 -0.13609  0.05886  2.72639 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -322.3727    94.7421  -3.403 0.000769 ***
## latitude       2.8413     0.7927   3.584 0.000401 ***
## longitude     -2.7841     1.0335  -2.694 0.007504 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4409 on 270 degrees of freedom
## Multiple R-squared:  0.05102,    Adjusted R-squared:  0.04399 
## F-statistic: 7.258 on 2 and 270 DF,  p-value: 0.0008505
## 
## 
## 
## [1] "========== n2-r1 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.2282 -0.3714 -0.1644  0.1214  4.7343 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -614.1915    82.3469  -7.459 1.02e-13 ***
## latitude      -0.8686     0.4705  -1.846    0.065 .  
## longitude     -8.7780     0.8779  -9.999  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6872 on 5253 degrees of freedom
## Multiple R-squared:  0.1038, Adjusted R-squared:  0.1035 
## F-statistic: 304.4 on 2 and 5253 DF,  p-value: < 2.2e-16
## 
## 
## 
## [1] "========== n2-r2 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.4464 -0.6834 -0.2428  0.4327  3.8755 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -888.5411    89.7697  -9.898   <2e-16 ***
## latitude      -0.9338     0.5683  -1.643      0.1    
## longitude    -12.5366     0.9398 -13.339   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9986 on 8343 degrees of freedom
## Multiple R-squared:  0.07902,    Adjusted R-squared:  0.0788 
## F-statistic: 357.9 on 2 and 8343 DF,  p-value: < 2.2e-16
## 
## 
## 
## [1] "========== n2-r3 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.7868 -0.3643 -0.2004  0.0641  4.7188 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1051.110    315.024  -3.337 0.000950 ***
## latitude        5.082      1.892   2.686 0.007624 ** 
## longitude     -11.401      3.362  -3.391 0.000786 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.722 on 314 degrees of freedom
## Multiple R-squared:  0.0354, Adjusted R-squared:  0.02925 
## F-statistic: 5.761 on 2 and 314 DF,  p-value: 0.00349
## 
## 
## 
## [1] "========== n3-r1 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5419 -0.2240 -0.0917  0.1024  4.9175 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -20.2460    12.3016  -1.646   0.0999 .
## latitude     -0.1631     0.2852  -0.572   0.5674  
## longitude    -0.3540     0.1887  -1.876   0.0608 .
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4257 on 2239 degrees of freedom
## Multiple R-squared:  0.001704,   Adjusted R-squared:  0.0008119 
## F-statistic: 1.911 on 2 and 2239 DF,  p-value: 0.1482
## 
## 
## 
## [1] "========== n3-r2 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.3955 -0.5444 -0.2012  0.3158  4.1267 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 31.86387   28.50462   1.118    0.264
## latitude    -0.65874    0.61449  -1.072    0.284
## longitude    0.06699    0.48209   0.139    0.889
## 
## Residual standard error: 0.8305 on 1381 degrees of freedom
## Multiple R-squared:  0.001545,   Adjusted R-squared:  9.897e-05 
## F-statistic: 1.068 on 2 and 1381 DF,  p-value: 0.3438
## 
## 
## 
## [1] "========== n3-r3 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.5401 -0.2595 -0.1545  0.0127  5.0126 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)  
## (Intercept) -101.808     90.766  -1.122   0.2641  
## latitude       4.512      2.232   2.022   0.0453 *
## longitude      1.122      1.312   0.855   0.3941  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6257 on 126 degrees of freedom
## Multiple R-squared:  0.03195,    Adjusted R-squared:  0.01659 
## F-statistic:  2.08 on 2 and 126 DF,  p-value: 0.1293
## 
## 
## 
## [1] "========== n4-r1 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.4399 -0.2616 -0.1008  0.1382  2.7047 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  41.8452   140.4584   0.298    0.766
## latitude     -1.3294     1.5633  -0.850    0.397
## longitude    -0.1532     1.3589  -0.113    0.910
## 
## Residual standard error: 0.4341 on 123 degrees of freedom
## Multiple R-squared:  0.008143,   Adjusted R-squared:  -0.007985 
## F-statistic: 0.5049 on 2 and 123 DF,  p-value: 0.6048
## 
## 
## 
## [1] "========== n4-r2 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -0.9720 -0.5710 -0.2809  0.3025  3.5485 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  264.212    293.718   0.900    0.370
## latitude      -1.752      3.055  -0.574    0.567
## longitude      2.605      2.687   0.970    0.334
## 
## Residual standard error: 0.8744 on 111 degrees of freedom
## Multiple R-squared:  0.00842,    Adjusted R-squared:  -0.009446 
## F-statistic: 0.4713 on 2 and 111 DF,  p-value: 0.6254
## 
## 
## 
## [1] "========== n4-r3 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##         1         2         4         5         7         8 
##  0.153761 -0.025443 -0.290715  0.569389 -0.397762 -0.009229 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -2919.25    3307.89  -0.883    0.442
## latitude       46.13      21.25   2.171    0.118
## longitude     -14.11      37.85  -0.373    0.734
## 
## Residual standard error: 0.444 on 3 degrees of freedom
## Multiple R-squared:  0.6387, Adjusted R-squared:  0.3979 
## F-statistic: 2.652 on 2 and 3 DF,  p-value: 0.2172
## 
## 
## 
## [1] "========== n5-r1 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.46134 -0.21757 -0.08786  0.10675  2.68073 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  94.6199    61.1603   1.547    0.123
## latitude     -0.5680     0.7298  -0.778    0.437
## longitude     0.9775     0.6102   1.602    0.110
## 
## Residual standard error: 0.3838 on 428 degrees of freedom
## Multiple R-squared:  0.006174,   Adjusted R-squared:  0.00153 
## F-statistic: 1.329 on 2 and 428 DF,  p-value: 0.2657
## 
## 
## 
## [1] "========== n5-r2 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -1.1884 -0.5191 -0.2658  0.1793  4.2156 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) -53.3410   175.2402  -0.304    0.761
## latitude      1.6975     2.1824   0.778    0.437
## longitude     0.2177     1.6531   0.132    0.895
## 
## Residual standard error: 0.9039 on 248 degrees of freedom
## Multiple R-squared:  0.003215,   Adjusted R-squared:  -0.004823 
## F-statistic:   0.4 on 2 and 248 DF,  p-value: 0.6708
## 
## 
## 
## [1] "========== n5-r3 =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.34642 -0.21654 -0.11738  0.07302  1.34094 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept)  49.9124   260.3294   0.192    0.849
## latitude     -0.3451     2.9279  -0.118    0.907
## longitude     0.4976     2.3626   0.211    0.834
## 
## Residual standard error: 0.3916 on 37 degrees of freedom
## Multiple R-squared:  0.001198,   Adjusted R-squared:  -0.05279 
## F-statistic: 0.02218 on 2 and 37 DF,  p-value: 0.9781
## 
## 
## 
## [1] "========== all =========="
## 
## Call:
## lm(formula = price ~ ., data = trains[[sub]])
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1739 -0.4434 -0.1419  0.2269  4.9515 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          -1.916e+02  1.420e+01 -13.497  < 2e-16 ***
## neighbourhood_group2  5.023e-01  1.618e-02  31.041  < 2e-16 ***
## neighbourhood_group3  2.216e-01  1.978e-02  11.202  < 2e-16 ***
## neighbourhood_group4 -9.535e-01  5.882e-02 -16.211  < 2e-16 ***
## neighbourhood_group5  2.687e-01  3.914e-02   6.865 6.81e-12 ***
## latitude             -1.726e+00  1.393e-01 -12.385  < 2e-16 ***
## longitude            -3.532e+00  1.595e-01 -22.145  < 2e-16 ***
## room_type2            9.996e-01  9.622e-03 103.887  < 2e-16 ***
## room_type3           -2.379e-01  3.068e-02  -7.754 9.18e-15 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.784 on 28680 degrees of freedom
## Multiple R-squared:  0.3914, Adjusted R-squared:  0.3912 
## F-statistic:  2306 on 8 and 28680 DF,  p-value: < 2.2e-16
# Label the result set and register it in the global model collection.
lin_reg$name = "Linear Regression"
model_lis$linear_regression=  lin_reg

 

DECISION TREE

# Fit a regression tree price ~ . on every subset; plot trees with more than
# one terminal node and report the test-set MSE.
# NOTE(review): `sum` is reassigned to the summary object inside the loop,
# shadowing base::sum.  The later call sum((pred - ...)^2) still resolves to
# the base function (R skips non-function bindings when calling), but the
# name collision is an accident waiting to happen — rename to e.g. `smry`.
dec_tree = vector("list")

for (sub in names(trains))
{
  dec_tree[[sub]]$fit = tree_res=tree(price~., data = trains[[sub]])
  dec_tree[[sub]]$summary = sum = summary(tree_res)
  
  print(paste0("========== ",sub, " =========="))

  print(sum)
  
  # A single-node tree cannot be plotted by plot.tree().
  if(sum$size > 1 )
  {
    plot(tree_res)
    text(tree_res,pretty=0)
    title(paste0("Tree of: ",sub))
    
   
  }
  else 
  {
    cat("Not possible to plot tree: ", sub)
  }
  
  dec_tree[[sub]]$pred  = pred = predict(tree_res,tests[[sub]])
  
  # Test mean squared error on the standardised price scale.
  dec_tree[[sub]]$MSE = mse =  sum((pred - tests[[sub]]$price)^2)/nrow(tests[[sub]])
  print(mse)
  cat("\n\n")
}
## [1] "========== 1 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  4 
## Residual mean deviance:  0.4488 = 6683 / 14890 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.9330 -0.3443 -0.1171  0.0000  0.1669  4.8810

## [1] 0.478872
## 
## 
## [1] "========== 2 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  4 
## Residual mean deviance:  0.7811 = 12230 / 15650 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.2970 -0.5221 -0.1952  0.0000  0.3185  4.7990

## [1] 0.7822847
## 
## 
## [1] "========== 3 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Variables actually used in tree construction:
## [1] "room_type"
## Number of terminal nodes:  2 
## Residual mean deviance:  0.3647 = 1540 / 4222 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.3900 -0.3014 -0.1245  0.0000  0.1319  4.9310

## [1] 0.3864543
## 
## 
## [1] "========== 4 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  9 
## Residual mean deviance:  0.2809 = 74.73 / 266 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.8840 -0.2813 -0.1205  0.0000  0.2203  2.4320

## [1] 0.4848371
## 
## 
## [1] "========== 5 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Variables actually used in tree construction:
## [1] "room_type"
## Number of terminal nodes:  2 
## Residual mean deviance:  0.3927 = 317.7 / 809 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.9862 -0.2943 -0.1239  0.0000  0.1033  4.9880

## [1] 0.2744778
## 
## 
## [1] "========== n1-r1 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  4 
## Residual mean deviance:  0.1783 = 1198 / 6720 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.75440 -0.22280 -0.07515  0.00000  0.11800  4.94600

## [1] 0.1691098
## 
## 
## [1] "========== n1-r2 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  4 
## Residual mean deviance:  0.731 = 4559 / 6237 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.9250 -0.5518 -0.2070  0.0000  0.3002  4.0740

## [1] 0.7503308
## 
## 
## [1] "========== n1-r3 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  8 
## Residual mean deviance:  0.1442 = 38.21 / 265 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.02700 -0.15190 -0.06911  0.00000  0.04448  2.05000

## [1] 0.2505339
## 
## 
## [1] "========== n2-r1 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  8 
## Residual mean deviance:  0.3902 = 2048 / 5248 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.5970 -0.3301 -0.1267  0.0000  0.1243  4.8870

## [1] 0.3882155
## 
## 
## [1] "========== n2-r2 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Variables actually used in tree construction:
## [1] "longitude"
## Number of terminal nodes:  2 
## Residual mean deviance:  1.01 = 8431 / 8344 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.3060 -0.7175 -0.2063  0.0000  0.4071  3.8260

## [1] 0.9585893
## 
## 
## [1] "========== n2-r3 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  12 
## Residual mean deviance:  0.4186 = 127.7 / 305 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.99000 -0.25310 -0.07929  0.00000  0.11360  3.77600

## [1] 0.7380578
## 
## 
## [1] "========== n3-r1 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  3 
## Residual mean deviance:  0.1738 = 389.1 / 2239 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.85200 -0.22640 -0.08418  0.00000  0.11440  4.94200

## [1] 0.1600965
## 
## 
## [1] "========== n3-r2 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  5 
## Residual mean deviance:  0.6466 = 891.7 / 1379 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.9740 -0.5074 -0.2091  0.0000  0.2925  4.1950

## [1] 0.6591288
## 
## 
## [1] "========== n3-r3 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  6 
## Residual mean deviance:  0.2555 = 31.43 / 123 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.81800 -0.15550 -0.07597  0.00000  0.05964  3.40800

## [1] 0.2989787
## 
## 
## [1] "========== n4-r1 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  7 
## Residual mean deviance:  0.1462 = 17.4 / 119 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.81560 -0.18000 -0.03524  0.00000  0.09394  2.13800

## [1] 0.1605553
## 
## 
## [1] "========== n4-r2 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  8 
## Residual mean deviance:  0.6078 = 64.43 / 106 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -1.5870 -0.4486 -0.1186  0.0000  0.2637  2.9850

## [1] 0.4368075
## 
## 
## [1] "========== n4-r3 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Variables actually used in tree construction:
## character(0)
## Number of terminal nodes:  1 
## Residual mean deviance:  0.3273 = 1.637 / 5 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -0.5301 -0.4165 -0.1609  0.0000  0.2651  0.9466 
## Not possible to plot tree:  n4-r3
## [1] 0.09602265
## 
## 
## [1] "========== n5-r1 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  6 
## Residual mean deviance:  0.1351 = 57.44 / 425 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.90420 -0.20040 -0.08683  0.00000  0.08357  2.75300

## [1] 0.2159434
## 
## 
## [1] "========== n5-r2 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Number of terminal nodes:  8 
## Residual mean deviance:  0.6961 = 169.1 / 243 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.2490 -0.4638 -0.1854  0.0000  0.2079  3.7900

## [1] 0.7622148
## 
## 
## [1] "========== n5-r3 =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Variables actually used in tree construction:
## [1] "latitude"
## Number of terminal nodes:  6 
## Residual mean deviance:  0.1029 = 3.5 / 34 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -0.52540 -0.15660 -0.04544  0.00000  0.04203  0.92870

## [1] 0.04892947
## 
## 
## [1] "========== all =========="
## 
## Regression tree:
## tree(formula = price ~ ., data = trains[[sub]])
## Variables actually used in tree construction:
## [1] "room_type" "longitude" "latitude" 
## Number of terminal nodes:  5 
## Residual mean deviance:  0.6082 = 17450 / 28680 
## Distribution of residuals:
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.3090 -0.4252 -0.1440  0.0000  0.2223  4.8800

## [1] 0.6005651
# Label the decision-tree results and register them in the shared model list
# (used later when comparing techniques).
dec_tree$name <- "Decision Tree"
model_lis$decision_tree <- dec_tree

 

RANDOM FOREST

# Fit a random-forest regression of price on all predictors for every
# train/test subset. For each subset name, `rf[[sub]]` collects:
#   $fit  - the fitted randomForest object (default settings, 500 trees)
#   $pred - predictions on the matching held-out test subset
#   $MSE  - test-set mean squared error
# Prints the model summary and MSE for each subset as a progress report.
rf <- vector("list")

for (sub in names(trains)) {
  # Fit on the training subset
  fit <- randomForest(price ~ ., data = trains[[sub]])
  rf[[sub]]$fit <- fit

  # Predict on the corresponding test subset
  predt <- predict(fit, tests[[sub]])
  rf[[sub]]$pred <- predt

  print(paste0("========== ", sub, " =========="))
  print(fit)

  # Test MSE; mean() is the idiomatic form of sum(...)/nrow(...)
  mse <- mean((predt - tests[[sub]]$price)^2)
  rf[[sub]]$MSE <- mse
  print(paste0("MSE: ", mse))
  cat("\n\n")
}
## [1] "========== 1 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.4275465
##                     % Var explained: 41.5
## [1] "MSE: 0.455167194530864"
## 
## 
## [1] "========== 2 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.7341265
##                     % Var explained: 37.13
## [1] "MSE: 0.732725058434287"
## 
## 
## [1] "========== 3 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.3475054
##                     % Var explained: 33.39
## [1] "MSE: 0.36031765127259"
## 
## 
## [1] "========== 4 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.3760561
##                     % Var explained: 22.19
## [1] "MSE: 0.447650149937715"
## 
## 
## [1] "========== 5 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.3902594
##                     % Var explained: 21.96
## [1] "MSE: 0.272695971076505"
## 
## 
## [1] "========== n1-r1 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.1889355
##                     % Var explained: 2.03
## [1] "MSE: 0.179037923876284"
## 
## 
## [1] "========== n1-r2 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.7834144
##                     % Var explained: 1.81
## [1] "MSE: 0.803170390195746"
## 
## 
## [1] "========== n1-r3 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.193599
##                     % Var explained: 4.44
## [1] "MSE: 0.237242915384785"
## 
## 
## [1] "========== n2-r1 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.4037602
##                     % Var explained: 23.33
## [1] "MSE: 0.372569290370662"
## 
## 
## [1] "========== n2-r2 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 1.036605
##                     % Var explained: 4.22
## [1] "MSE: 0.977980628310895"
## 
## 
## [1] "========== n2-r3 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.5747621
##                     % Var explained: -7.36
## [1] "MSE: 0.728586903061683"
## 
## 
## [1] "========== n3-r1 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.1710464
##                     % Var explained: 5.67
## [1] "MSE: 0.162726364082157"
## 
## 
## [1] "========== n3-r2 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.7131623
##                     % Var explained: -3.47
## [1] "MSE: 0.65418966508262"
## 
## 
## [1] "========== n3-r3 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.4474675
##                     % Var explained: -13.3
## [1] "MSE: 0.224070733281232"
## 
## 
## [1] "========== n4-r1 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.1859426
##                     % Var explained: -0.26
## [1] "MSE: 0.142122318467766"
## 
## 
## [1] "========== n4-r2 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.956762
##                     % Var explained: -27.45
## [1] "MSE: 0.486763656664129"
## Warning in randomForest.default(m, y, ...): The response has five or fewer
## unique values. Are you sure you want to do regression?
## [1] "========== n4-r3 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.2507168
##                     % Var explained: 8.09
## [1] "MSE: 0.106142985149516"
## 
## 
## [1] "========== n5-r1 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.16317
##                     % Var explained: -10.89
## [1] "MSE: 0.216914168102416"
## 
## 
## [1] "========== n5-r2 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.8909938
##                     % Var explained: -10.02
## [1] "MSE: 0.642336872850853"
## 
## 
## [1] "========== n5-r3 =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.1977788
##                     % Var explained: -39.25
## [1] "MSE: 0.0534407934234958"
## 
## 
## [1] "========== all =========="
## 
## Call:
##  randomForest(formula = price ~ ., data = trains[[sub]]) 
##                Type of random forest: regression
##                      Number of trees: 500
## No. of variables tried at each split: 1
## 
##           Mean of squared residuals: 0.5793331
##                     % Var explained: 42.61
## [1] "MSE: 0.567668204356476"
# Tag the random-forest results with a display name and add them to the
# collection of fitted models for later comparison.
rf$name <- "Random Forest"
model_lis$random_forest <- rf

 

RANGER RANDOM FOREST

# Same experiment as the randomForest loop, but using the faster `ranger`
# implementation. For each subset: fit, predict on the held-out data,
# record test MSE, and print a per-subset report.
# NOTE(review): `ranger` is presumably attached earlier in the document —
# no library(ranger) call is visible in this chunk.
ranger_rf <- vector("list")

for (sub in names(trains)) {
  print(paste0("========== ", sub, " =========="))

  # Keep the forest (write.forest) so predict() works afterwards
  fitted <- ranger( price~ ., data = trains[[sub]], write.forest = TRUE, classification = F)
  ranger_rf[[sub]]$fit <- fitted

  # ranger's predict() returns an object; predictions live in $predictions
  preds <- predict(fitted, tests[[sub]])
  ranger_rf[[sub]]$pred <- preds

  err <- sum((preds$predictions - tests[[sub]]$price)^2) / nrow(tests[[sub]])
  ranger_rf[[sub]]$MSE <- err

  print(fitted)
  print(paste0("MSE: ", err))
  cat("\n\n")
}
## [1] "========== 1 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      14893 
## Number of independent variables:  3 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.4250408 
## R squared (OOB):                  0.418418 
## [1] "MSE: 0.451498508118412"
## 
## 
## [1] "========== 2 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      15658 
## Number of independent variables:  3 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.7335627 
## R squared (OOB):                  0.371838 
## [1] "MSE: 0.732041957369135"
## 
## 
## [1] "========== 3 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      4224 
## Number of independent variables:  3 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.3463108 
## R squared (OOB):                  0.3363436 
## [1] "MSE: 0.359392908585742"
## 
## 
## [1] "========== 4 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      275 
## Number of independent variables:  3 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.3774909 
## R squared (OOB):                  0.221737 
## [1] "MSE: 0.448427295260819"
## 
## 
## [1] "========== 5 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      811 
## Number of independent variables:  3 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.3975296 
## R squared (OOB):                  0.2060783 
## [1] "MSE: 0.274350614878704"
## 
## 
## [1] "========== n1-r1 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      6724 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.1891423 
## R squared (OOB):                  0.01938132 
## [1] "MSE: 0.179420027053377"
## 
## 
## [1] "========== n1-r2 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      6241 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.783826 
## R squared (OOB):                  0.01775623 
## [1] "MSE: 0.801841176308322"
## 
## 
## [1] "========== n1-r3 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      273 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.1959439 
## R squared (OOB):                  0.03639428 
## [1] "MSE: 0.240502896012614"
## 
## 
## [1] "========== n2-r1 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      5256 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.4018633 
## R squared (OOB):                  0.2370543 
## [1] "MSE: 0.372211157839943"
## 
## 
## [1] "========== n2-r2 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      8346 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       1.036172 
## R squared (OOB):                  0.04272476 
## [1] "MSE: 0.976239444208424"
## 
## 
## [1] "========== n2-r3 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      317 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.5750939 
## R squared (OOB):                  -0.07087803 
## [1] "MSE: 0.745111227940539"
## 
## 
## [1] "========== n3-r1 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      2242 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.1719263 
## R squared (OOB):                  0.05225914 
## [1] "MSE: 0.161653618970503"
## 
## 
## [1] "========== n3-r2 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      1384 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.7135851 
## R squared (OOB):                  -0.03460708 
## [1] "MSE: 0.658703608163795"
## 
## 
## [1] "========== n3-r3 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      129 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.4445584 
## R squared (OOB):                  -0.1168601 
## [1] "MSE: 0.240502052536699"
## 
## 
## [1] "========== n4-r1 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      126 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.1873397 
## R squared (OOB):                  -0.002135701 
## [1] "MSE: 0.140282184958391"
## 
## 
## [1] "========== n4-r2 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      114 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.9489554 
## R squared (OOB):                  -0.2530063 
## [1] "MSE: 0.493786919827387"
## 
## 
## [1] "========== n4-r3 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      6 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.2510545 
## R squared (OOB):                  0.233047 
## [1] "MSE: 0.0920501022957262"
## 
## 
## [1] "========== n5-r1 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      431 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.1640681 
## R squared (OOB):                  -0.112398 
## [1] "MSE: 0.215429146469569"
## 
## 
## [1] "========== n5-r2 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      251 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.9027592 
## R squared (OOB):                  -0.1103079 
## [1] "MSE: 0.643759831987133"
## 
## 
## [1] "========== n5-r3 =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      40 
## Number of independent variables:  2 
## Mtry:                             1 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.1944478 
## R squared (OOB):                  -0.3348204 
## [1] "MSE: 0.0515924800078338"
## 
## 
## [1] "========== all =========="
## Ranger result
## 
## Call:
##  ranger(price ~ ., data = trains[[sub]], write.forest = TRUE,      classification = F) 
## 
## Type:                             Regression 
## Number of trees:                  500 
## Sample size:                      28689 
## Number of independent variables:  4 
## Mtry:                             2 
## Target node size:                 5 
## Variable importance mode:         none 
## Splitrule:                        variance 
## OOB prediction error (MSE):       0.5497237 
## R squared (OOB):                  0.4554978 
## [1] "MSE: 0.543636080000731"
# Tag the ranger results and register them alongside the other models.
ranger_rf$name <- "Ranger Random Forest"
model_lis$ranger <- ranger_rf

 

NEURAL NETWORKS

# Build and compile a small feed-forward regression network:
# two hidden ReLU layers (32 and 16 units) feeding a single linear output.
#
# dimension: number of input features (the input_shape of the first layer).
# Returns the compiled keras model (MSE loss, RMSprop optimizer,
# tracking mean absolute error during training).
build_model <- function(dimension) {
  net <- keras::keras_model_sequential() %>%
    layer_dense(units = 32, activation = "relu", input_shape = dimension) %>%
    layer_dense(units = 16, activation = "relu") %>%
    layer_dense(units = 1, activation = "linear")

  net %>% compile(
    loss = "mse",
    optimizer = optimizer_rmsprop(),
    metrics = list("mean_absolute_error")
  )

  net
}


# Keras callback that prints a single "." after every epoch, giving a
# compact progress indicator when fit() runs with verbose = 0.
# NOTE: the original guarded the cat() with `epoch %% 1 == 0`, which is
# always TRUE (every integer is divisible by 1) — the condition was dead
# code and has been removed; behavior is unchanged.
print_dot_callback <- callback_lambda(
  on_epoch_end = function(epoch, logs) {
    cat(".")
  }
)


# Train a small neural network (see build_model) on every subset.
# For each subset name, `nn[[sub]]` collects: $epochs, $model, $summary,
# $history, $loss, $mae, $pred and $MSE on the held-out test data.
nn <- vector("list")

for (sub in names(trains)) {
  d  <- trains[[sub]]
  d2 <- tests[[sub]]
  len  <- length(d)   # column count of the train set (price is the last column)
  len2 <- length(d2)  # column count of the test set

  # One-hot encode the categorical predictors for the network.
  if (!is.null(d$room_type)) {
    d$room_type  <- keras::to_categorical(d$room_type)
    d2$room_type <- keras::to_categorical(d2$room_type)
  }

  if (!is.null(d$neighbourhood_group)) {
    d$neighbourhood_group  <- keras::to_categorical(d$neighbourhood_group)
    d2$neighbourhood_group <- keras::to_categorical(d2$neighbourhood_group)
  }

  # Split the target (price, last column) from the feature matrix.
  target   <- as.vector(d$price)
  features <- as.matrix(as_tibble(d[-len]))

  target_test <- as.vector(d2$price)
  # BUG FIX: the original dropped column `len` (the TRAIN set's column
  # count) from the TEST set; `len2` was computed but never used. Use the
  # test set's own column count so the correct (price) column is removed.
  features_test <- as.matrix(as_tibble(d2[-len2]))

  nn[[sub]]$epochs <- epochs <- 30

  nn[[sub]]$model <- model <- build_model(dim(features)[2])

  nn[[sub]]$summary <- model %>% summary()
  nn[[sub]]$history <- hist <- model %>% fit(
    x = features,
    y = target,
    epochs = epochs,
    validation_split = 0.2,
    verbose = 0,
    callbacks = list(print_dot_callback)
  )
  eva <- model %>% evaluate(features_test, target_test, verbose = 0)

  # BUG FIX: keras evaluate() returns the loss first, then the metrics in
  # compile() order — here c(loss, mean_absolute_error). The original
  # stored eva[1] as $mae and eva[2] as $loss, i.e. swapped.
  nn[[sub]]$loss <- eva[1]
  nn[[sub]]$mae  <- eva[2]

  nn[[sub]]$pred <- pred <- model %>% predict(features_test)
  # Test MSE; mean() is the idiomatic form of sum(...)/length(...)
  nn[[sub]]$MSE <- mean((pred - target_test)^2)
}
## Model: "sequential"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense (Dense)                       (None, 32)                      224         
## ________________________________________________________________________________
## dense_1 (Dense)                     (None, 16)                      528         
## ________________________________________________________________________________
## dense_2 (Dense)                     (None, 1)                       17          
## ================================================================================
## Total params: 769
## Trainable params: 769
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_1"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_3 (Dense)                     (None, 32)                      224         
## ________________________________________________________________________________
## dense_4 (Dense)                     (None, 16)                      528         
## ________________________________________________________________________________
## dense_5 (Dense)                     (None, 1)                       17          
## ================================================================================
## Total params: 769
## Trainable params: 769
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_2"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_6 (Dense)                     (None, 32)                      224         
## ________________________________________________________________________________
## dense_7 (Dense)                     (None, 16)                      528         
## ________________________________________________________________________________
## dense_8 (Dense)                     (None, 1)                       17          
## ================================================================================
## Total params: 769
## Trainable params: 769
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_3"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_9 (Dense)                     (None, 32)                      224         
## ________________________________________________________________________________
## dense_10 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_11 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 769
## Trainable params: 769
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_4"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_12 (Dense)                    (None, 32)                      224         
## ________________________________________________________________________________
## dense_13 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_14 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 769
## Trainable params: 769
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_5"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_15 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_16 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_17 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_6"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_18 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_19 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_20 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_7"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_21 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_22 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_23 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_8"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_24 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_25 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_26 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_9"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_27 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_28 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_29 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_10"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_30 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_31 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_32 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_11"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_33 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_34 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_35 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_12"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_36 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_37 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_38 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_13"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_39 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_40 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_41 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_14"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_42 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_43 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_44 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_15"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_45 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_46 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_47 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_16"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_48 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_49 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_50 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_17"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_51 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_52 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_53 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_18"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_54 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_55 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_56 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_19"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_57 (Dense)                    (None, 32)                      96          
## ________________________________________________________________________________
## dense_58 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_59 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 641
## Trainable params: 641
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................Model: "sequential_20"
## ________________________________________________________________________________
## Layer (type)                        Output Shape                    Param #     
## ================================================================================
## dense_60 (Dense)                    (None, 32)                      416         
## ________________________________________________________________________________
## dense_61 (Dense)                    (None, 16)                      528         
## ________________________________________________________________________________
## dense_62 (Dense)                    (None, 1)                       17          
## ================================================================================
## Total params: 961
## Trainable params: 961
## Non-trainable params: 0
## ________________________________________________________________________________
## ..............................
# Label the neural-network results and register them in the model list
# used later for the cross-model comparison table.
nn$name <- "Neural Networks"
model_lis$neural_networks <- nn

 

NN plots

# For every fitted subset, print the training-history plot together with
# its test MAE and loss. The "name" entry is a label, not a model fit,
# so it is skipped.
for (sub in names(nn)) {
  if (sub != "name") {
    m <- nn[[sub]]
    hist <- m$history
    print(paste0("========== ", sub, " =========="))
    str <- paste0("MAE: ", round(m$mae, 3), " --- Loss: ", round(m$loss, 3))
    # NOTE(review): `y ~ x` is forwarded to the keras history plot method
    # as-is from the original code; assumed intentional — TODO confirm.
    p <- plot(hist, y ~ x) +
      theme_bw(base_size = 12) +
      ggtitle(paste0("NN of: ", sub)) +
      labs(caption = str)

    # The original called print(p) and then plot(p), rendering every
    # figure twice (hence the doubled geom_smooth messages in the
    # knitted output); printing once is enough.
    print(p)
    cat("MAE: ", m$mae)
    cat("\nLoss: ", m$loss, "\n\n")
  }
}
## [1] "========== 1 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.6122801
## Loss:  0.4845496 
## 
## [1] "========== 2 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.8479623
## Loss:  0.6393498 
## 
## [1] "========== 3 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.4926897
## Loss:  0.4206005 
## 
## [1] "========== 4 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.5653926
## Loss:  0.5909714 
## 
## [1] "========== 5 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.9475082
## Loss:  0.8239455 
## 
## [1] "========== n1-r1 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.3691125
## Loss:  0.4329883 
## 
## [1] "========== n1-r2 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.857663
## Loss:  0.6326119 
## 
## [1] "========== n1-r3 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.4154404
## Loss:  0.5541389 
## 
## [1] "========== n2-r1 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.4940058
## Loss:  0.4452811 
## 
## [1] "========== n2-r2 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  1.069584
## Loss:  0.8379608 
## 
## [1] "========== n2-r3 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  1.214825
## Loss:  1.012357 
## 
## [1] "========== n3-r1 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.1678099
## Loss:  0.2578662 
## 
## [1] "========== n3-r2 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.6874239
## Loss:  0.5797735 
## 
## [1] "========== n3-r3 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.0545099
## Loss:  0.1592267 
## 
## [1] "========== n4-r1 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.1458065
## Loss:  0.3156902 
## 
## [1] "========== n4-r2 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.3665866
## Loss:  0.492869 
## 
## [1] "========== n4-r3 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.3569993
## Loss:  0.5379468 
## 
## [1] "========== n5-r1 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.2133904
## Loss:  0.2625024 
## 
## [1] "========== n5-r2 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  1.036595
## Loss:  0.7100449 
## 
## [1] "========== n5-r3 =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.07074246
## Loss:  0.2192494 
## 
## [1] "========== all =========="
## `geom_smooth()` using formula 'y ~ x'

## `geom_smooth()` using formula 'y ~ x'

## MAE:  0.6285941
## Loss:  0.5615722

 

Comparison between the models

 

# Build the row labels for the comparison table: one row per
# neighbourhood group, then one per group/room-type pair, then "All".
ng <- as.character(unique(ds$neighbourhood_group))
rt <- as.character(unique(ds$room_type))

# outer() + t() reproduces the original nested-loop ordering (room type
# varying fastest within each group); paste(n, "-", r) with the default
# sep is equivalent to paste(n, r, sep = " - ").
conc <- c(ng, as.vector(t(outer(ng, rt, paste, sep = " - "))), "All")

n <- names(nn)

# One column per model: the MSE of every fitted subset. The "name"
# entry of each model list is a label, not a fit, so it is skipped.
# NOTE(review): row labels come from `ds` while values are indexed by
# names(nn) — assumed to be in the same order; verify upstream.
cols <- vector("list")
cols[["Subset"]] <- conc
for (m in model_lis) {
  col <- c()
  for (nam in n) {
    if (nam != "name") {
      # Growing with c() also silently drops NULL entries for subsets a
      # model was not fitted on, matching the original behavior.
      col <- c(col, m[[nam]]$MSE)
    }
  }
  cols[[m$name]] <- col
}


# Assemble the MSE comparison table and mark, for each subset, the model
# that achieved the lowest error.
mse_df <- as.data.frame(cols)

model_cols <- 2:ncol(mse_df)   # column 1 is the "Subset" label
best <- character(length(cols$Subset))
for (i in seq_along(cols$Subset)) {
  m <- min(mse_df[i, model_cols])
  # Search only the model columns (the original also scanned the label
  # column 1) and take the first match so a tie still yields exactly one
  # winner per row.
  col <- model_cols[which(mse_df[i, model_cols] == m)][1]
  best[i] <- names(mse_df)[col]
}
mse_df$Best <- best
#kableExtra::kable(mse_df)%>%
#  kable_styling(bootstrap_options = "striped", full_width = F,font_size = 20) %>%
##  row_spec(0, bold = T, color = "white", background = "#D7261E")%>%
##  column_spec(1, bold = T, border_right = T,color = "white", background = "#191970")%>%
#  column_spec(2:6,extra_css="text-align:Center")

mse_df

  

 

Clustering and Groups

 

Clustering for mixed-type data

# Number of clusters fitted for every data subset.
clust_num <- 5

# Cluster one data subset and prepare its map plots.
#
# dts:      data frame with latitude/longitude columns (room_type may be
#           present for the mixed-data case)
# num:      number of clusters to fit
# dim_plot: plot window as c(min_lat, max_lat, min_long, max_long)
# name:     label used in the plot titles
#
# Returns a list with the fitted model ($cl), one map per cluster
# ($clust_plot, keyed "1".."num"), per-cluster summaries ($summary) and
# a combined map ($myplot). Relies on the global `img` background image
# and, for mixed data, on kproto() — presumably from clustMixType;
# verify the package is attached upstream.
get_clusters <- function(dts, num, dim_plot, name) {
  res <- list()

  # k-means for purely numeric data; k-prototypes when the categorical
  # room_type column is present.
  if (is.null(dts$room_type)) {
    res$cl <- kmeans(dts, num)
  } else {
    res$cl <- kproto(dts, num)
  }

  # One data frame per cluster, in cluster-index order.
  groups <- lapply(seq_len(num), function(k) dts[res$cl$cluster == k, ])

  lat_lo  <- dim_plot[1]
  lat_hi  <- dim_plot[2]
  long_lo <- dim_plot[3]
  long_hi <- dim_plot[4]

  # Base map that will accumulate one point layer per cluster.
  combined <- ggplot() +
    background_image(img) +
    xlab('Longitude') +
    ylab('Latitude') +
    theme(plot.margin = unit(c(1, 1, 1, 1), "cm"),
          legend.title = element_text(colour = "blue", size = 10, face = "bold")) +
    xlim(long_lo, long_hi) +
    ylim(lat_lo, lat_hi) +
    ggtitle(paste0("Clusters of ", name))

  res$clust_plot <- vector("list")
  for (k in seq_along(groups)) {
    grp <- groups[[k]]

    # Add this cluster to the combined map, colored by cluster index.
    combined <- combined +
      geom_point(data = grp, aes(y = latitude, x = longitude), color = k)

    # Stand-alone map showing just this cluster.
    single <- ggplot() +
      background_image(img) +
      geom_point(data = grp, aes(y = latitude, x = longitude), color = k) +
      xlim(long_lo, long_hi) +
      ylim(lat_lo, lat_hi) +
      ggtitle(paste0("Cluster: ", k, " of ", name))

    res$clust_plot[[as.character(k)]] <- single
    res$summary[[as.character(k)]] <- summary(grp)
  }
  res$myplot <- combined
  res
}

# Run get_clusters() on every subset in clust_data and collect the
# combined maps, in subset order, under an extra "all_plots" entry.
get_all_cluster <- function(clust_data, clust_num, dim) {
  out <- vector("list")
  combined_maps <- vector("list")

  idx <- 1
  for (nm in names(clust_data)) {
    out[[nm]] <- get_clusters(clust_data[[nm]], clust_num, dim, nm)
    combined_maps[[idx]] <- out[[nm]]$myplot
    idx <- idx + 1
  }

  out[["all_plots"]] <- combined_maps
  out
}

# Plot window covering the full extent of the data:
# c(min/max latitude, min/max longitude).
borders <- with(clust_data$all, c(range(latitude), range(longitude)))
all_cluster <- get_all_cluster(clust_data, 5, borders)
## # NAs in variables:
##  latitude longitude room_type     price 
##         0         0         0         0 
## 0 observation(s) with NAs.
## 
## Estimated lambda: 0.8287906 
## 
## # NAs in variables:
##  latitude longitude room_type     price 
##         0         0         0         0 
## 0 observation(s) with NAs.
## 
## Estimated lambda: 1.272312 
## 
## # NAs in variables:
##  latitude longitude room_type     price 
##         0         0         0         0 
## 0 observation(s) with NAs.
## 
## Estimated lambda: 1.701065 
## 
## # NAs in variables:
##  latitude longitude room_type     price 
##         0         0         0         0 
## 0 observation(s) with NAs.
## 
## Estimated lambda: 0.935192 
## 
## # NAs in variables:
##  latitude longitude room_type     price 
##         0         0         0         0 
## 0 observation(s) with NAs.
## 
## Estimated lambda: 0.7781472 
## 
## # NAs in variables:
## neighbourhood_group            latitude           longitude           room_type 
##                   0                   0                   0                   0 
##               price 
##                   0 
## 0 observation(s) with NAs.
## 
## Estimated lambda: 1.747748
# Arrange a list of ggplot objects on a single page in a grid.
#
# plotlist: list of plots to draw
# file:     unused; kept for backward compatibility with the original API
# cols:     number of grid columns (ignored when `layout` is supplied)
# layout:   optional matrix; the cell containing value i places plot i
#           at that grid position
#
# Draws on the current graphics device; returns nothing useful.
multiplots <- function(plotlist, file = NULL, cols = 2, layout = NULL) {
  # grid supplies the viewport machinery; requireNamespace() instead of
  # the original require(), which silently returns FALSE on failure.
  if (!requireNamespace("grid", quietly = TRUE)) {
    stop("multiplots() needs the 'grid' package", call. = FALSE)
  }

  plots <- c(plotlist)
  numPlots <- length(plots)

  # Nothing to draw: the original fell through to a loop over 1:0 and
  # errored on an empty plot list.
  if (numPlots == 0) {
    return(invisible(NULL))
  }

  if (is.null(layout)) {
    # Fill the grid row by row with `cols` columns.
    layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
                     ncol = cols, nrow = ceiling(numPlots / cols),
                     byrow = TRUE)
  }

  if (numPlots == 1) {
    print(plots[[1]])
  } else {
    grid::grid.newpage()
    grid::pushViewport(grid::viewport(
      layout = grid::grid.layout(nrow(layout), ncol(layout))))

    for (i in seq_len(numPlots)) {
      # Locate plot i inside the layout matrix and draw into that cell.
      matchidx <- as.data.frame(which(layout == i, arr.ind = TRUE))
      print(plots[[i]],
            vp = grid::viewport(layout.pos.row = matchidx$row,
                                layout.pos.col = matchidx$col))
    }
  }
  invisible(NULL)
}

# Combined cluster map for the 21st subset — presumably the "all"
# subset, which is the last entry of clust_data; verify against its
# construction upstream.
all_cluster$all_plots[[21]]

# Per-cluster summaries for the "all" subset.
all_cluster$all$summary
## $`1`
##     neighbourhood_group    latitude         longitude        
##  Brooklyn     :9124     Min.   :-2.9118   Min.   :-2.813579  
##  Manhattan    :   0     1st Qu.:-1.0252   1st Qu.:-0.337869  
##  Queens       : 809     Median :-0.7689   Median :-0.045750  
##  Staten Island:  10     Mean   :-0.7509   Mean   :-0.005138  
##  Bronx        :   0     3rd Qu.:-0.3489   3rd Qu.: 0.307583  
##                         Max.   : 0.7754   Max.   : 4.680280  
##            room_type        price         
##  Private room   : 294   Min.   :-1.32482  
##  Entire home/apt:9564   1st Qu.:-0.35924  
##  Shared room    :  85   Median : 0.08379  
##                         Mean   : 0.18155  
##                         3rd Qu.: 0.54954  
##                         Max.   : 4.18465  
## 
## $`2`
##     neighbourhood_group    latitude         longitude       
##  Brooklyn     :9691     Min.   :-2.9821   Min.   :-1.27977  
##  Manhattan    :   0     1st Qu.:-0.9275   1st Qu.:-0.04332  
##  Queens       :3275     Median :-0.6285   Median : 0.35279  
##  Staten Island:   0     Mean   :-0.6364   Mean   : 0.72043  
##  Bronx        :   0     3rd Qu.:-0.3030   3rd Qu.: 0.96288  
##                         Max.   : 0.9290   Max.   : 5.16264  
##            room_type         price        
##  Private room   :11690   Min.   :-1.3248  
##  Entire home/apt:  799   1st Qu.:-0.9499  
##  Shared room    :  477   Median :-0.8136  
##                          Mean   :-0.7313  
##                          3rd Qu.:-0.5864  
##                          Max.   : 3.0487  
## 
## $`3`
##     neighbourhood_group    latitude          longitude      
##  Brooklyn     :  583    Min.   :-2.25573   Min.   :-3.7011  
##  Manhattan    :10563    1st Qu.:-0.01754   1st Qu.:-0.9270  
##  Queens       :  288    Median : 0.34290   Median :-0.6883  
##  Staten Island:    9    Mean   : 0.37664   Mean   :-0.6005  
##  Bronx        :   23    3rd Qu.: 0.69452   3rd Qu.:-0.3315  
##                         Max.   : 2.84824   Max.   : 3.7225  
##            room_type         price        
##  Private room   :  448   Min.   :-0.4160  
##  Entire home/apt:10975   1st Qu.: 0.4359  
##  Shared room    :   43   Median : 0.8903  
##                          Mean   : 1.2141  
##                          3rd Qu.: 1.8559  
##                          Max.   : 4.1847  
## 
## $`4`
##     neighbourhood_group    latitude          longitude      
##  Brooklyn     : 460     Min.   :-4.18085   Min.   :-6.3324  
##  Manhattan    :4434     1st Qu.:-0.25722   1st Qu.:-1.0413  
##  Queens       :  25     Median :-0.02293   Median :-0.8472  
##  Staten Island: 347     Mean   :-0.13418   Mean   :-1.0240  
##  Bronx        :   0     3rd Qu.: 0.37603   3rd Qu.:-0.7135  
##                         Max.   : 0.98828   Max.   : 0.1238  
##            room_type        price        
##  Private room   :4126   Min.   :-1.3248  
##  Entire home/apt: 885   1st Qu.:-0.6432  
##  Shared room    : 255   Median :-0.3706  
##                         Mean   :-0.3517  
##                         3rd Qu.:-0.1320  
##                         Max.   : 1.9127  
## 
## $`5`
##     neighbourhood_group    latitude        longitude        
##  Brooklyn     :   0     Min.   :0.3207   Min.   :-0.681685  
##  Manhattan    :5880     1st Qu.:1.0551   1st Qu.:-0.001138  
##  Queens       :1235     Median :1.4429   Median : 0.206028  
##  Staten Island:   0     Mean   :1.4809   Mean   : 0.365540  
##  Bronx        :1059     3rd Qu.:1.8254   3rd Qu.: 0.551628  
##                         Max.   :3.3632   Max.   : 3.731570  
##            room_type        price        
##  Private room   :5609   Min.   :-1.3135  
##  Entire home/apt:2280   1st Qu.:-0.8250  
##  Shared room    : 285   Median :-0.5864  
##                         Mean   :-0.5374  
##                         3rd Qu.:-0.3592  
##                         Max.   : 2.0149
# Draw the combined cluster maps, up to six per page.
# Fix: the list element is called "all_plots"; the original
# `$all_plot` only resolved through R's partial matching of `$`.
multiplots(all_cluster$all_plots[1:5], cols = 3)
## Loading required package: grid

multiplots(all_cluster$all_plots[6:11], cols = 3)

multiplots(all_cluster$all_plots[12:17], cols = 3)

# Index 21 is shown separately above; indices 22 and 23 are out of
# range in the original and print as NULL.
multiplots(all_cluster$all_plots[c(18:20, 22, 23)], cols = 3)

## NULL
## NULL

 

Hierarchical Cluster Analysis

# Mean price per neighbourhood-group / room-type pair, then hierarchical
# clustering on the Gower distances between those aggregated profiles.
agg <- aggregate(price ~ neighbourhood_group + room_type, clust_data$all, mean)

# Label each row from its own values. The original rebuilt the names
# with nested loops in group-major order, but aggregate() orders rows
# with the FIRST grouping variable varying fastest, so labels could be
# attached to the wrong rows (and the lengths would mismatch if any
# group/room-type combination were absent).
rownames(agg) <- paste0(substr(agg$neighbourhood_group, 1, 5), "/",
                        substr(agg$room_type, 1, 3))

agg
gower <- daisy(agg, metric = "gower")
hc1 <- hclust(gower, method = "complete")

plot(hc1, cex = 0.6, hang = -1)

avg_dend_obj <- as.dendrogram(hc1)
avg_col_dend <- color_branches(avg_dend_obj, h = 0.6)
plot(avg_col_dend)

# Repeat the hierarchical clustering on per-group mean price only.
agg <- aggregate(price ~ neighbourhood_group, clust_data$all, mean)
agg
# Take the labels from the data itself rather than a hard-coded vector,
# so names stay attached to the right rows whatever order aggregate()
# returns the groups in.
rownames(agg) <- as.character(agg$neighbourhood_group)
agg$neighbourhood_group <- NULL
gower <- daisy(agg, metric = "gower")
hc1 <- hclust(gower, method = "complete")

plot(hc1, cex = 0.6, hang = -1)

 

Principal Component Analysis

 

PCAmixdata

## Split mixed dataset into quantitative and qualitative variables
#split <- splitmix(dataset[1:5])

# splitmix() (PCAmixdata package) separates the numeric columns
# (X.quanti) from the factor columns (X.quali).
split = splitmix(clust_data$all)
## PCA
# PCAmix generalises PCA to mixed data: standard PCA on the numeric
# part plus multiple correspondence analysis on the categorical part.
res.pcamix <- PCAmix(X.quanti=split$X.quanti,  
                     X.quali=split$X.quali, 
                     rename.level=TRUE)

res.pcamix
## 
## Call:
## PCAmix(X.quanti = split$X.quanti, X.quali = split$X.quali, rename.level = TRUE)
## 
## Method = Principal Component of mixed data (PCAmix)
## 
## 
## "name" "description"
## "$eig" "eigenvalues of the principal components (PC) "
## "$ind" "results for the individuals (coord,contrib,cos2)"
## "$quanti" "results for the quantitative variables (coord,contrib,cos2)"
## "$levels" "results for the levels of the qualitative variables (coord,contrib,cos2)"
## "$quali" "results for the qualitative variables (contrib,relative contrib)"
## "$sqload" "squared loadings"
## "$coef" "coef of the linear combinations defining the PC"
## Inspect principal components
# Eigenvalues: variance explained per dimension (first 4 dims ~69%).
res.pcamix$eig
##       Eigenvalue Proportion Cumulative
## dim 1  2.1303801  23.670890   23.67089
## dim 2  1.8046238  20.051375   43.72227
## dim 3  1.2456603  13.840670   57.56294
## dim 4  1.0028761  11.143067   68.70600
## dim 5  0.9971611  11.079568   79.78557
## dim 6  0.9570021  10.633357   90.41893
## dim 7  0.4201255   4.668061   95.08699
## dim 8  0.2652652   2.947391   98.03438
## dim 9  0.1769058   1.965620  100.00000
# Correlations of the quantitative variables with each dimension.
res.pcamix$quanti.cor
##                dim 1      dim 2      dim 3       dim 4       dim 5
## latitude  -0.1972875 0.89428404 -0.2214475 -0.04100129 -0.04272996
## longitude  0.7617791 0.35537824  0.4064136  0.01097103  0.01298250
## price     -0.7183816 0.07289152  0.4855170  0.02253704  0.01357485
# Squared correlation ratios of the qualitative variables per dimension.
res.pcamix$quali.eta2
##                         dim 1       dim 2     dim 3     dim 4     dim 5
## neighbourhood_group 0.6390071 0.871714919 0.4342793 0.5273305 0.4806940
## room_type           0.3560712 0.001558062 0.3614432 0.4732362 0.5142885

 

Factor Analysis of Mixed Data (FAMD)

FAMD(base, ncp = 5, sup.var = NULL, ind.sup = NULL, graph = TRUE), where: base is a data frame with n rows (individuals) and p columns (variables); ncp is the number of dimensions kept in the results (5 by default); sup.var is a vector of indexes of the supplementary variables; ind.sup is a vector of indexes of the supplementary individuals; graph is a logical value — if TRUE, a graph is displayed.

# Fit FAMD (FactoMineR) on the mixed-type data, keeping 5 dimensions;
# graph = F suppresses the default plots so we can draw them below.
res.famd <- FAMD(clust_data$all, graph = F, ncp = 5)
print(res.famd)
## *The results are available in the following objects:
## 
##   name          description                             
## 1 "$eig"        "eigenvalues and inertia"               
## 2 "$var"        "Results for the variables"             
## 3 "$ind"        "results for the individuals"           
## 4 "$quali.var"  "Results for the qualitative variables" 
## 5 "$quanti.var" "Results for the quantitative variables"
# Eigenvalues/variance per dimension — identical to PCAmix above, as
# expected since FAMD and PCAmix implement the same method.
eig.val <- get_eigenvalue(res.famd)
head(eig.val)
##       eigenvalue variance.percent cumulative.variance.percent
## Dim.1  2.1303801         23.67089                    23.67089
## Dim.2  1.8046238         20.05138                    43.72227
## Dim.3  1.2456603         13.84067                    57.56294
## Dim.4  1.0028761         11.14307                    68.70600
## Dim.5  0.9971611         11.07957                    79.78557
# Scree plot of the eigenvalues (fviz_screeplot is from factoextra).
fviz_screeplot(res.famd)

# Extract FAMD results restricted to the quantitative variables.
quanti.var <- get_famd_var(res.famd, "quanti.var")
quanti.var
## FAMD results for quantitative variables 
##  ===================================================
##   Name       Description                      
## 1 "$coord"   "Coordinates"                    
## 2 "$cos2"    "Cos2, quality of representation"
## 3 "$contrib" "Contributions"
# Correlation circle of the quantitative variables, coloured by their
# contribution to the dimensions.
fviz_famd_var(res.famd, "quanti.var", col.var = "contrib", 
              gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07"),
              repel = TRUE)

# Extract FAMD results restricted to the qualitative variable categories.
quali.var <- get_famd_var(res.famd, "quali.var")
quali.var 
## FAMD results for qualitative variable categories 
##  ===================================================
##   Name       Description                      
## 1 "$coord"   "Coordinates"                    
## 2 "$cos2"    "Cos2, quality of representation"
## 3 "$contrib" "Contributions"
fviz_famd_var(res.famd, "quali.var", col.var = "contrib", 
              gradient.cols = c("#00AFBB", "#E7B800", "#FC4E07")
)

# Combined results for all (quantitative + qualitative) variables.
var <- get_famd_var(res.famd)
var
## FAMD results for variables 
##  ===================================================
##   Name       Description                      
## 1 "$coord"   "Coordinates"                    
## 2 "$cos2"    "Cos2, quality of representation"
## 3 "$contrib" "Contributions"
# Coordinates of variables
head(var$coord)
##                          Dim.1       Dim.2      Dim.3        Dim.4        Dim.5
## latitude            0.03892236 0.799743946 0.04903898 0.0016811055 0.0018258497
## longitude           0.58030735 0.126293692 0.16517205 0.0001203636 0.0001685452
## price               0.51607212 0.005313173 0.23572672 0.0005079180 0.0001842764
## neighbourhood_group 0.63900709 0.871714919 0.43427934 0.5273304673 0.4806939587
## room_type           0.35607121 0.001558062 0.36144319 0.4732362202 0.5142884950
# Cos2: quality of representation on the factor map
head(var$cos2)
##                          Dim.1        Dim.2       Dim.3        Dim.4
## latitude            0.00151495 6.395904e-01 0.002404821 2.826116e-06
## longitude           0.33675663 1.595010e-02 0.027281805 1.448739e-08
## price               0.26633043 2.822981e-05 0.055567086 2.579807e-07
## neighbourhood_group 0.10208251 1.899717e-01 0.047149637 6.951936e-02
## room_type           0.06339335 1.213779e-06 0.065320588 1.119763e-01
##                            Dim.5
## latitude            3.333727e-06
## longitude           2.840749e-08
## price               3.395780e-08
## neighbourhood_group 5.776667e-02
## room_type           1.322463e-01
# Contributions to the dimensions (percentages; columns sum to 100)
head(var$contrib)
##                         Dim.1       Dim.2     Dim.3       Dim.4       Dim.5
## latitude             1.827015 44.31638047  3.936786  0.16762844  0.18310478
## longitude           27.239615  6.99833910 13.259799  0.01200184  0.01690250
## price               24.224415  0.29441998 18.923837  0.05064614  0.01848011
## neighbourhood_group 29.994980 48.30452324 34.863386 52.58181750 48.20624738
## room_type           16.713975  0.08633722 29.016193 47.18790608 51.57526523
# Plot of variables on the first two dimensions

fviz_famd_var(res.famd, repel = TRUE)

# Contribution to the first dimension
fviz_contrib(res.famd, "var", axes = 1)

# Contribution to the second dimension
fviz_contrib(res.famd, "var", axes = 2)

# Contribution to the third dimension
fviz_contrib(res.famd, "var", axes = 3)

# Contribution to the fourth dimension
fviz_contrib(res.famd, "var", axes = 4)

# Contribution to the fifth dimension
fviz_contrib(res.famd, "var", axes = 5)